library(dplyr)
library(forcats)
library(ggplot2)
library(plotly)
library(broom)
library(boot)
library(caret)
library(pROC)
library(knitr)
library(MASS)
library(data.table)

# Project Part 2
# Group 9
# Loading libraries
# Loading dataset

# dataset <- read.csv("SMARTc.csv", sep = ";") # Without missing values
dataset <- fread("SMARTc.csv", sep = ";") # Much faster

# Re-encode the categorical variables
# Re-encode the categorical variables as labelled factors.
# Helper for the recurring 0/1 -> no/yes encoding.
as_yes_no <- function(x) fct_recode(factor(x), "no" = "0", "yes" = "1")

dataset <- mutate(dataset,
  EVENT    = as_yes_no(EVENT),
  SEX      = fct_recode(factor(SEX), "male" = "1", "female" = "2"),
  DIABETES = as_yes_no(DIABETES),
  SMOKING  = fct_recode(factor(SMOKING), "never" = "1", "former" = "2", "current" = "3"),
  alcohol  = fct_recode(factor(alcohol), "never" = "1", "former" = "2", "current" = "3"),
  CEREBRAL = as_yes_no(CEREBRAL),
  CARDIAC  = as_yes_no(CARDIAC),
  AAA      = as_yes_no(AAA),
  PERIPH   = as_yes_no(PERIPH),
  albumin  = fct_recode(factor(albumin), "no" = "1", "micro" = "2", "macro" = "3"),
  STENOSIS = as_yes_no(STENOSIS)
)

# Logistic regression model of EVENT
Assessing performance using cross-validation
# 10-fold cross-validation of the logistic regression model:
# fit on k-1 folds, evaluate discrimination (ROC/AUC) on the held-out fold.
k <- 10
set.seed(123) # reproducible fold assignment
folds <- createFolds(dataset$EVENT, k = k, list = TRUE, returnTrain = FALSE)

roc_list <- vector("list", k) # preallocate: one ROC object per fold
auc_values <- numeric(k)

for (i in seq_len(k)) { # seq_len() rather than 1:k (safe if k were 0)
  train <- dataset[-folds[[i]], ]
  test <- dataset[folds[[i]], ]
  fit_train <- glm(EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
    HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
    alcohol + albumin, data = train, family = "binomial")
  # Predicted P(EVENT = "yes") on the held-out fold
  predict_test <- predict(fit_train, newdata = test, type = "response")
  roc_i <- roc(test$EVENT, predict_test)
  roc_list[[i]] <- roc_i
  auc_values[i] <- auc(roc_i)
}

# display code to plot the ROC curves
# Stack the per-fold ROC coordinates into one data frame for ggplot.
# NOTE: the column named "Specificity" actually stores 1 - specificity
# (the false positive rate), matching the x-axis label below.
roc_df <- do.call(rbind, lapply(seq_along(roc_list), function(i) {
  data.frame(
    Fold = paste("Fold", i),
    Sensitivity = roc_list[[i]]$sensitivities,
    Specificity = 1 - roc_list[[i]]$specificities
  )
}))

# One ROC curve per fold, on a square (1:1 aspect) panel.
roc_plot <- ggplot(roc_df, aes(x = Specificity, y = Sensitivity, color = Fold)) +
  geom_line(linewidth = 0.8) +
  scale_color_brewer(palette = "Set3") +
  theme_minimal(base_size = 14) +
  labs(
    title = "ROC Curves for Each Fold",
    x = "1 - Specificity",
    y = "Sensitivity",
    color = "Fold"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
    legend.position = "bottom"
  ) +
  coord_equal()
ggplotly(roc_plot)

# Per-fold AUCs, tabulated for display.
auc_table <- data.frame(Iteration = 1:k, AUC = auc_values)
# Round the per-fold AUCs and append the mean as a summary row.
auc_table$AUC <- round(auc_table$AUC, 4)
mean_auc <- round(mean(auc_values), 4)
# rbind() with a character row coerces the AUC column to character;
# acceptable here since the table is display-only.
auc_table <- rbind(auc_table, c("AUC mean", mean_auc))
kable(auc_table, caption = "AUC Values for 10-Fold Cross-Validation")

| Iteration | AUC |
|---|---|
| 1 | 0.7331 |
| 2 | 0.6542 |
| 3 | 0.7895 |
| 4 | 0.745 |
| 5 | 0.6944 |
| 6 | 0.7877 |
| 7 | 0.7776 |
| 8 | 0.7044 |
| 9 | 0.7471 |
| 10 | 0.7574 |
| AUC mean | 0.7391 |
# 95% t-interval for the mean cross-validated AUC
# (use <- for assignment, consistent with the later CI computations).
(auc_confidence_interval <- t.test(auc_values)$conf.int)
[1] 0.7077229 0.7703818
attr(,"conf.level")
[1] 0.95
This means that we can be 95% confident that the true AUC value lies between 0.7077 and 0.7704.
Calibration model
Assessing the calibration using a calibration plot
# df_copy <- mutate(dataset,
# EVENT = fct_recode(EVENT, "0" = "no", "1" = "yes"),
# )
# Map the factor outcome to 0/1 (no = 0, yes = 1); same mapping as the
# EVENT_numeric construction used later for the Brier scores.
# NOTE(review): `test` and `predict_test` come from the LAST CV fold only,
# so calibration is assessed on that single held-out fold.
observed_outcomes <- ifelse(test$EVENT == "yes", 1, 0)
calibration_data <- data.frame(
  Predicted = predict_test,
  Observed = observed_outcomes
)

# display code to plot the calibration plot
# Calibration plot: LOESS smooth of observed outcome against predicted
# probability, with the identity (ideal calibration) line for reference.
calibration_plot <- ggplot(calibration_data, aes(x = Predicted, y = Observed)) +
  geom_smooth(method = "loess", se = FALSE, formula = y ~ x, color = "#3ca7c7") +
  geom_point(size = 2, alpha = 0.5) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#ff8800") +
  theme_minimal(base_size = 14) +
  labs(
    title = "Calibration Plot",
    x = "Predicted Probability",
    y = "Observed Probability"
  ) +
  theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold"))
ggplotly(calibration_plot)

The orange dashed line is the ideal calibration line, where the predicted probabilities perfectly match the observed outcomes. The blue curve shows the actual calibration of the model — how the predictions from the model align with the observed data.
For the low predicted probabilities (0.0 to 0.1), the blue curve is relatively flat and near the dashed line, indicating that the model is well-calibrated for low-probability predictions. For the moderate predicted probabilities (0.1 to 0.3), the blue curve deviates from the orange dashed line and rises faster than the ideal line. This suggests overprediction in this range: the model tends to predict higher probabilities than are observed in reality. For higher predicted probabilities (above 0.3), the blue curve is consistently above the orange line, indicating a general overestimation of risk. This means that when the model predicts a higher probability of an event, the actual frequency is lower than what is predicted. This overestimation can lead to false positives, where the model predicts events that don't actually occur as often as expected.
A well calibrated model is useful because it provides accurate probabilities that can be used to make informed decisions. If a model is poorly calibrated, the predicted probabilities may not accurately reflect the true likelihood of an event, which can lead to suboptimal decisions. In this case, the model tends to overestimate the risk of having a cardiovascular event, which could lead to unnecessary interventions or treatments.
Performance comparison between the logistic regression model and a LDA model
# 10-fold CV comparing logistic regression and LDA on both
# discrimination (AUC) and calibration/accuracy (Brier score).
k <- 10
# NOTE(review): no set.seed() before createFolds here, unlike the first CV,
# so these folds (and the tables below) are not reproducible across runs.
folds <- createFolds(dataset$EVENT, k = k, list = TRUE, returnTrain = FALSE)

auc_values_glm <- numeric(k)
auc_values_lda <- numeric(k)
brier_scores_glm <- numeric(k)
brier_scores_lda <- numeric(k)

# Shared predictor set for both models
fit_formula <- EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
  HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
  alcohol + albumin

for (i in seq_len(k)) { # seq_len() rather than 1:k
  train <- dataset[-folds[[i]], ]
  test <- dataset[folds[[i]], ]
  # Fit both models on the training folds
  fit_train <- glm(fit_formula, data = train, family = "binomial")
  lda_train <- lda(fit_formula, data = train)
  # Predicted P(EVENT = "yes") on the held-out fold
  predict_test_glm <- predict(fit_train, newdata = test, type = "response")
  predict_test_lda <- predict(lda_train, newdata = test)$posterior[, 2]
  # Discrimination: per-fold AUC
  auc_values_glm[i] <- auc(roc(test$EVENT, predict_test_glm))
  auc_values_lda[i] <- auc(roc(test$EVENT, predict_test_lda))
  # Calibration/accuracy: per-fold Brier score (mean squared error on 0/1)
  test$EVENT_numeric <- ifelse(test$EVENT == "yes", 1, 0)
  brier_scores_glm[i] <- mean((predict_test_glm - test$EVENT_numeric)^2)
  brier_scores_lda[i] <- mean((predict_test_lda - test$EVENT_numeric)^2)
}

auc_table <- data.frame(
  Iteration = 1:k,
  AUC_GLM = auc_values_glm,
  AUC_LDA = auc_values_lda
)
# Append the mean AUC of each model as a summary row
# (rbind() with a character row coerces columns to character; display-only).
mean_auc_glm <- mean(auc_values_glm)
mean_auc_lda <- mean(auc_values_lda)
auc_table <- rbind(auc_table, c("AUC mean", mean_auc_glm, mean_auc_lda))
kable(auc_table, caption = "AUC Values for 10-Fold Cross-Validation")

| Iteration | AUC_GLM | AUC_LDA |
|---|---|---|
| 1 | 0.689213311232947 | 0.680606910620936 |
| 2 | 0.694632156062731 | 0.681308172893026 |
| 3 | 0.757618258319521 | 0.746716817544307 |
| 4 | 0.787836287135025 | 0.786816269284712 |
| 5 | 0.830854309687262 | 0.833015509788965 |
| 6 | 0.754876960346806 | 0.761379574142548 |
| 7 | 0.733329083258957 | 0.736261634578605 |
| 8 | 0.755960729312763 | 0.760550809639169 |
| 9 | 0.670099160945843 | 0.672260361047546 |
| 10 | 0.720760233918129 | 0.719171116196288 |
| AUC mean | 0.739518049021998 | 0.73780871757361 |
# Per-fold Brier scores for both models, with the mean appended as a
# summary row (rbind() with a character row coerces columns; display-only).
brier_scores_table <- data.frame(
  Iteration = 1:k,
  Brier_Score_GLM = brier_scores_glm,
  Brier_Score_LDA = brier_scores_lda
)
mean_brier_score_glm <- mean(brier_scores_glm)
mean_brier_score_lda <- mean(brier_scores_lda)
brier_scores_table <- rbind(
  brier_scores_table,
  c("Brier Score mean", mean_brier_score_glm, mean_brier_score_lda)
)
kable(brier_scores_table, caption = "Brier Scores for 10-Fold Cross-Validation")

| Iteration | Brier_Score_GLM | Brier_Score_LDA |
|---|---|---|
| 1 | 0.0980693094964066 | 0.101151527699879 |
| 2 | 0.0957994926276176 | 0.0992645798727123 |
| 3 | 0.0937325644094975 | 0.0975508816258736 |
| 4 | 0.0934333213044997 | 0.0949895733456367 |
| 5 | 0.0856124284059258 | 0.0841449827380715 |
| 6 | 0.0953253710852003 | 0.0956756178111479 |
| 7 | 0.0930190550803127 | 0.0935793549109133 |
| 8 | 0.0965181691367765 | 0.0976024186782206 |
| 9 | 0.0965950257935387 | 0.0975672443459604 |
| 10 | 0.0980470551266242 | 0.101430931284722 |
| Brier Score mean | 0.09461517924664 | 0.0962957112313137 |
# 95% t-intervals for the mean cross-validated AUC of each model.
(auc_confidence_interval_glm <- t.test(auc_values_glm)$conf.int)
[1] 0.7047454 0.7742907
attr(,"conf.level")
[1] 0.95

(auc_confidence_interval_lda <- t.test(auc_values_lda)$conf.int)
[1] 0.7010771 0.7745404
attr(,"conf.level")
[1] 0.95
This means that we can be 95% confident that the true AUC value for the logistic regression model lies between 0.7047 and 0.7743, and for the LDA model between 0.7011 and 0.7745.
# 95% t-intervals for the mean cross-validated Brier score of each model.
(brier_score_confidence_interval_glm <- t.test(brier_scores_glm)$conf.int)
[1] 0.09201474 0.09721562
attr(,"conf.level")
[1] 0.95

(brier_score_confidence_interval_lda <- t.test(brier_scores_lda)$conf.int)
[1] 0.09275269 0.09983874
attr(,"conf.level")
[1] 0.95
This means that we can be 95% confident that the true Brier score for the logistic regression model lies between 0.0920 and 0.0972, and for the LDA model between 0.0928 and 0.0998.
Conclusion
The logistic regression model and the LDA model have similar performance in terms of AUC and Brier score. The logistic regression model has a slightly lower Brier score, indicating better calibration, but the difference is small. Both models have similar AUC values, indicating similar discrimination performance.